t1 <- Sys.time()
EH23a <- read.csv("../FigureSideo/EH23a.softmasked_nuccomp.csv")
EH23a$chrom_num <- sub(".+chr", "", EH23a$Id)
EH23a$chrom_num[ EH23a$chrom_num == "X" ] <- 10
EH23a$chrom_num <- as.numeric(EH23a$chrom_num)
EH23a[1:3, ]
EH23a[, 1:2]
EH23b <- read.csv("../FigureSideo/EH23b.softmasked_nuccomp.csv")
EH23b$chrom_num <- sub(".+chr", "", EH23b$Id)
EH23b$chrom_num[ EH23b$chrom_num == "X" ] <- 10
EH23b$chrom_num <- as.numeric(EH23b$chrom_num)
EH23b[1:3, ]
# File > 50 MB compressed.
syri <- read.table("/media/knausb/E737-9B48/knausb/mm2_maps/EH23a_EH23b/EH23a_EH23brcsyri.out",
sep = "\t", na.strings = "-")
names(syri) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
#syri$ref_start[is.na(as.numeric(syri$ref_start))]
syri$ref_start <- as.numeric(syri$ref_start)
syri$ref_end <- as.numeric(syri$ref_end)
syri$query_start <- as.numeric(syri$query_start)
syri$query_end <- as.numeric(syri$query_end)
syri[1:3, ]
sort(table(syri$Annotation_Type), decreasing = TRUE)
syn <- syri[syri[, 11] == "SYN", ]
syn[1:3, ]
nrow(syn)
last_win <- 1
#coalesce <- 5e4
#coalesce <- 1e5
coalesce <- 4e5
#coalesce <- 1e6
for( j in 2:nrow(syn) ){
if( syn$ref_start[j] - syn$ref_end[last_win] < coalesce &
syn$query_start[j] - syn$query_end[last_win] < coalesce &
syn$ref_chrom[j] == syn$ref_chrom[last_win] ){
syn$ref_end[ last_win ] <- syn$ref_end[j]
syn$query_end[ last_win ] <- syn$query_end[j]
#syn$ref_end[j] <- NA
syn$unique_ID[j] <- NA
} else {
last_win <- j
}
}
#syn <- syn[ !is.na( syn$ref_end ), ]
syn <- syn[ !is.na( syn$unique_ID ), ]
syn[1:3, ]
nrow(syn)
min(syn$ref_end - syn$ref_start)
hist(syn$ref_end - syn$ref_start)
sort(syn$ref_end - syn$ref_start, decreasing = T)[1:10]
syn[1:3, ]
inv <- syri[syri$Annotation_Type == "INV", ]
dup <- syri[syri$Annotation_Type == "DUP", ]
dup[1:3, ]
nrow(dup)
trans <- syri[syri$Annotation_Type == "TRANS", ]
trans[1:3, ]
nrow(trans)
# nrow(syn)
# x <- syn[1:1000, ]
# nsmooth <- 10
# min_len <- 1e5
#gribbon <- function(x, nsmooth = 50, min_len = 1e5){
gribbon <- function(
x,
nsmooth = 10,
min_len = 1e5,
coalesce = 0,
invert = FALSE){
x <- x[(x$ref_end - x$ref_start) >= min_len, ]
x <- x[(x$query_end - x$query_start) >= min_len, ]
if( nrow(x) <= 0 ){
return("No rows")
}
# if( coalesce > 0 ){
# last_win <- 1
# for( j in 2:nrow(x) ){
# if( x$ref_start[j] - x$ref_end[last_win] < coalesce &
# x$query_start[j] - x$query_end[last_win] < coalesce ){
# x$ref_end[ last_win ] <- x$ref_end[j]
# x$query_end[ last_win ] <- x$query_end[j]
# x$ref_end[j] <- NA
# } else {
# last_win <- j
# }
# }
# x <- x[ !is.na( x$ref_end ), ]
# }
if( invert ){
tmp <- x$query_start
x$query_start <- x$query_end
x$query_end <- tmp
}
# Set a chromosome number.
x$ref_chrom_num <- sub(".+chr", "", x$ref_chrom)
x$ref_chrom_num[ x$ref_chrom_num == "X" ] <- 10
x$ref_chrom_num[ x$ref_chrom_num == "Y" ] <- 11
x$ref_chrom_num <- as.numeric(x$ref_chrom_num)
x$ref_chrom_num <- x$ref_chrom_num - 0.2
x$query_chrom_num <- sub(".+chr", "", x$query_chrom)
x$query_chrom_num[ x$query_chrom_num == "X" ] <- 10
x$query_chrom_num[ x$query_chrom_num == "Y" ] <- 11
x$query_chrom_num <- as.numeric(x$query_chrom_num)
x$query_chrom_num <- x$query_chrom_num + 0.2
# nrow(x)
my_x <- seq( from = -4, to = 4, length.out = nsmooth)
my_y <- 1/( 1 + exp(1)^-my_x)
my_x <- (my_x + 4)/8
# plot(my_x, my_y)
# my_x <- c(my_x, rev(my_x))
my_polys <- data.frame(
ID = rep(x$unique_ID, each = nsmooth * 2),
ref_chrom_num = rep(x$ref_chrom_num, each = nsmooth * 2),
# ref_start = rep(NA, each = nsmooth * 2),
# ref_end = rep(NA, each = nsmooth * 2),
query_chrom_num = rep(x$query_chrom_num, each = nsmooth * 2),
# query_start = rep(NA, each = nsmooth * 2),
# query_end = rep(NA, each = nsmooth * 2)
poly_xs = rep(NA, each = nsmooth * 2),
poly_ys = rep(NA, each = nsmooth * 2)
)
my_polys[1:3, ]
my_vs <- seq(1, nrow(my_polys) + 1, by = nsmooth * 2)
# for( i in seq(1, nrow(my_polys), by = nsmooth * 2) ){
for( i in 1:nrow(x) ){
my_index <- my_vs[i]:(my_vs[i+1] - 1)
xmins <- seq( x$ref_chrom_num[i], x$query_chrom_num[i], length.out = nsmooth)
# my_y * (x$query_end[i] - x$query_start[i]) + x$query_start[i]
ymins <- my_y * (x$query_start[i] - x$ref_start[i]) + x$ref_start[i]
ymaxs <- my_y * (x$query_end[i] - x$ref_end[i]) + x$ref_end[i]
my_polys$poly_xs[my_index] <- c(xmins, rev(xmins))
my_polys$poly_ys[my_index] <- c(ymins, rev(ymaxs))
#x$ref_start[ my_index ] <- seq(x$ref_chrom[i], x$query_chrom[i], length.out = nsmooth)
}
# my_polys[1:3, ]
return( my_polys )
}
#poly_coords <- gribbon(syn[1:1000, ])
#poly_coords <- gribbon(syn, min_len = 1e5)
#
nrow(syn)
## [1] 40
poly_coords <- gribbon(syn, min_len = 1e1, coalesce = 0)
#poly_coords <- gribbon(syn, min_len = 1e3)
# poly_coords <- gribbon(syn, nsmooth = 40, min_len = 1e3)
# poly_coords <- gribbon(syn, min_len = 1e1)
length(unique(poly_coords$ID))
## [1] 40
poly_coords_inv <- gribbon(inv, min_len = 1e4, invert = TRUE)
poly_coords_dup <- gribbon(dup, min_len = 1e4)
poly_coords_trans <- gribbon(trans, min_len = 1e4)
cmd <- "~/gits/hempy/bin/fast_win.py /media/knausb/Vining_lab/knausb/mm2_maps/EH23a_EH23b_v2/EH23b.rc458X.softmasked.fasta.gz --win_size 1000000"
#system(cmd)
# sampn <- "EH23a"
# sampn <- "EH23b"
get_wins <- function( sampn ){
# sampd <- "../FigureSideo/EH23a.softmasked_wins.csv"
wins_dir <- "../FigureSideo/"
gffs_dir <- "/media/knausb/E737-9B48/releases/scaffolded/"
# sampn <- unlist(lapply(strsplit(sampd, split = "/"), function(x){x[length(x)]}))
# sampn <- unlist(lapply(strsplit(sampd, split = "//"), function(x){x[length(x)]}))
# sampn <- sub(".softmasked_wins.csv", "", sampn)
# sub(paste(sampn, ".softmasked_wins.csv", sep = ""), "", sampd)
if( sampn == "EH23b" ){
wins <- read.csv( "EH23b.rc458X.softmasked_wins.csv" )
wins$Id <- sub("EH23a", "EH23b", wins$Id)
} else {
wins <- read.csv( paste( wins_dir, sampn, ".softmasked_wins.csv", sep = "") )
}
# wins <- read.csv( paste( sampn, ".softmasked_wins.csv", sep = "") )
# wins <- read.csv( sampd )
# wins <- wins[ grep( paste(sampn, ".chr", sep = ""), wins$Id ), ]
wins$chrn <- sub(".+chr", "", wins$Id)
wins$chrn[ wins$chrn == "X" ] <- 10
wins$chrn[ wins$chrn == "Y" ] <- 11
wins$chrn <- as.numeric(wins$chrn)
#wins[1:3, ]
# Scale and center.
# Scaline of CG windows (chromosome width) is on a per chromosome basis.
# wins$CGs <- 0
wins$CGs <- wins$CG / wins$Win_length * 100
# wins$notCG <- (wins$Win_length / 1) - wins$CG
# wins$notCGs <- 0
for( j in unique(wins$Id) ){
wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j] - min(wins$CGs[ wins$Id == j], na.rm = TRUE)
wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j] / max(wins$CGs[ wins$Id == j], na.rm = TRUE)
#
# wins$CGs[ wins$Id == j] <- wins$CG[ wins$Id == j] - min(wins$CG[ wins$Id == j], na.rm = TRUE)
# wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j]/max(wins$CGs[ wins$Id == j], na.rm = TRUE)
#
# wins$notCGs[ wins$Id == j] <- wins$notCG[ wins$Id == j] - min(wins$notCG[ wins$Id == j], na.rm = TRUE)
# wins$notCGs[ wins$Id == j] <- wins$notCGs[ wins$Id == j]/max(wins$notCGs[ wins$Id == j], na.rm = TRUE)
}
wins$iCGs <- 1 - wins$CGs
#wins$ATs <- 1 - wins$CGs
#wins$ATs <- wins$Win_length/1e6 - wins$CGs
# genes <- read.table(
# paste(sampd, "/", sampn, ".primary_high_confidence.gff3.gz", sep = "" ),
# sep = "\t" )
genes <- read.table(
paste(gffs_dir, sampn, "/", sampn, ".primary_high_confidence.gff3.gz", sep = "" ),
sep = "\t" )
genes <- genes[genes[, 3] == "gene", ]
if( sampn == "EH23b" ){
# Column 4 is starts.
genes[ genes[, 1] == "EH23b.chr4", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr4" ] - genes[ genes[, 1] == "EH23b.chr4", 4]
genes[ genes[, 1] == "EH23b.chr4", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr4" ] - genes[ genes[, 1] == "EH23b.chr4", 5]
genes[ genes[, 1] == "EH23b.chr5", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr5" ] - genes[ genes[, 1] == "EH23b.chr5", 4]
genes[ genes[, 1] == "EH23b.chr5", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr5" ] - genes[ genes[, 1] == "EH23b.chr5", 5]
genes[ genes[, 1] == "EH23b.chr8", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr8" ] - genes[ genes[, 1] == "EH23b.chr8", 4]
genes[ genes[, 1] == "EH23b.chr8", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr8" ] - genes[ genes[, 1] == "EH23b.chr8", 5]
genes[ genes[, 1] == "EH23b.chrX", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chrX" ] - genes[ genes[, 1] == "EH23b.chrX", 4]
genes[ genes[, 1] == "EH23b.chrX", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chrX" ] - genes[ genes[, 1] == "EH23b.chrX", 5]
wins[1:3, ]
}
genes[1:3, ]
# Windowize
wins$gcnt <- 0
for(i in 1:nrow(wins)){
# if( sampn == "EH23b" ){
# wins$Id <- sub("EH23a", "EH23b", wins$Id)
# }
tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
wins$gcnt[i] <- nrow(tmp)
}
# Scaling of gene count windows is on a per assembly basis.
wins$gcntsc <- wins$gcnt - min(wins$gcnt)
wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
my_index <- round( wins$gcntsc * 100 )
my_index[ my_index <= 0] <- 1
# Color ramp.
#wins$gcol <- heat.colors(n=100)[ my_index ]
#wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]
#wins$gcol <- viridisLite::plasma(n = 100, alpha = 1, begin = 0.1, end = 1)[ my_index ]
#wins$gcol <- viridisLite::magma(n = 100, alpha = 1, begin = 0.2, end = 1.00)[ my_index ]
wins$gcol <- viridisLite::magma(n = 100, alpha = 1, begin = 0.1, end = 0.9)[ my_index ]
return(wins)
}
EH23a_wins <- get_wins("EH23a")
EH23a_wins[1:3, ]
EH23b_wins <- get_wins("EH23b")
EH23b_wins[1:3, ]
Windowize SNPs
# EH23a_wins[1:3, ]
#
# snps <- syri[syri$Annotation_Type == "SNP", ]
# nrow(snps)
# snps[1:3, ]
#
# EH23a_wins$snps <- 0
#
# for(i in 1:nrow(EH23a_wins)){
# tmp <- snps[ snps$ref_chrom == EH23a_wins$Id[i] & snps$ref_start >= EH23a_wins$Start[i] & snps$ref_end <= EH23a_wins$End[i], ]
# EH23a_wins$snps[i] <- nrow(tmp)
# }
# EH23a_wins$snpssc <- (EH23a_wins$snps - min(EH23a_wins$snps))/max(EH23a_wins$snps)
#
# hist(EH23a_wins$snps)
# hist(EH23a_wins$snps[ EH23a_wins$Id == "EH23a.chr4" ])
# my_hist <- hist(EH23a_wins$snpssc * 9 + 1, plot = F)
#
# #barplot(my_hist$density, space = 0, col = gray.colors(n=10))
# #axis(side=1)
# barplot(my_hist$density ~ my_hist$breaks[-10], space = 0, col = gray.colors(n=10))
sampn <- "EH23a"
ablstn <- read.csv(
paste("/media/knausb/Vining_lab/knausb/blast_projects/todd_rrna/blastn_uc/", sampn, "_blastn.csv", sep = ""),
header = FALSE)
colnames(ablstn) <- c("qseqid","qlen","sseqid","slen","qstart","qend",
"sstart","send","evalue","bitscore","score","length",
"pident","nident","mismatch","positive","gapopen",
"gaps","ppos","sstrand")
ablstn <- ablstn[grep("chr", ablstn$sseqid), ]
ablstn$chrn <- sub(".*chr", "", ablstn$sseqid)
ablstn$chrn[ablstn$chrn == "X"] <- 10
ablstn$chrn[ablstn$chrn == "Y"] <- 11
ablstn$chrn <- as.numeric(ablstn$chrn)
sampn <- "EH23b"
bblstn <- read.csv(
paste("/media/knausb/Vining_lab/knausb/blast_projects/todd_rrna/blastn_uc/", sampn, "_blastn.csv", sep = ""),
header = FALSE)
colnames(bblstn) <- c("qseqid","qlen","sseqid","slen","qstart","qend",
"sstart","send","evalue","bitscore","score","length",
"pident","nident","mismatch","positive","gapopen",
"gaps","ppos","sstrand")
bblstn <- bblstn[grep("chr", bblstn$sseqid), ]
bblstn$chrn <- sub(".*chr", "", bblstn$sseqid)
bblstn$chrn[bblstn$chrn == "X"] <- 10
bblstn$chrn[bblstn$chrn == "Y"] <- 11
bblstn$chrn <- as.numeric(bblstn$chrn)
my_chrom <- "EH23b.chr4"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chr5"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chr8"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chrX"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
sampn <- "EH23b"
my_obs <- ls()
#save( list = my_obs, file = "my_obs.RData" )
#load( file = "my_obs.RData" )
range(c(EH23a_wins$gcnt, EH23b_wins$gcnt))
## [1] 0 156
#hist(EH23a_wins$gcnt)
#range(EH23a_wins$gcnt)
#range(EH23a_wins$gcnt/max(EH23a_wins$gcnt) * 100)
new_col <- EH23a_wins$gcnt
new_col <- new_col/max(c(EH23a_wins$gcnt, EH23b_wins$gcnt)) * 99 + 1
new_col <- as.integer(new_col)
#range(new_col)
#length(new_col)
#new_col <- viridisLite::magma(n=100, begin = 0, end = 1)[new_col]
#new_col <- viridisLite::magma(n=100, begin = 0.1, end = 1.0)[new_col]
new_col <- viridisLite::magma(n=100, begin = 0.1, end = 0.9)[new_col]
#length(new_col)
EH23a_wins$gcol <- new_col
new_col <- EH23b_wins$gcnt
new_col <- new_col/max(c(EH23a_wins$gcnt, EH23b_wins$gcnt)) * 99 + 1
new_col <- as.integer(new_col)
#range(new_col)
#length(new_col)
#new_col <- viridisLite::magma(n=100, begin = 0, end = 1)[new_col]
#new_col <- viridisLite::magma(n=100, begin = 0.1, end = 1.0)[new_col]
new_col <- viridisLite::magma(n=100, begin = 0.1, end = 0.9)[new_col]
#length(new_col)
EH23b_wins$gcol <- new_col
library(ggplot2)
chrom_wid <- 0.05
p <- ggplot2::ggplot()
p <- p + theme_bw()
p <- p + ggplot2::geom_rect(
data = EH23a,
ggplot2::aes( xmin = chrom_num - chrom_wid - 0.2,
xmax = chrom_num + chrom_wid - 0.2,
ymin = 1, ymax = Length),
fill = "#DCDCDC",
color = "#000000"
)
p <- p + ggplot2::geom_rect(
data = EH23b,
ggplot2::aes( xmin = chrom_num - chrom_wid + 0.2,
xmax = chrom_num + chrom_wid + 0.2,
ymin = 1, ymax = Length),
fill = "#DCDCDC",
color = "#000000"
)
# EH23a_wins[1:3, ]
my_cols <- gray.colors(n=10, start = 0.3, end = 0.9)[round(EH23a_wins$snpssc * 9 + 1)]
#my_cols <- gray.colors(n=100, start = 0.3, end = 0.9)[round(EH23a_wins$snpssc * 99 + 1)]
#p <-
# p + ggplot2::geom_rect(
# data = EH23a_wins,
# ggplot2::aes( xmin = chrn - 0.5,
# xmax = chrn - 0.25,
# ymin = Start, ymax = End),
# # fill = "#DCDCDC",
# fill = my_cols,
# # color = "#000000"
# )
# Theme
p <- p + ggplot2::scale_x_continuous(
breaks = EH23a$chrom_num,
# limits = c(0.6, 10.4),
# limits = c(0.5, 10.5),
# labels = EH23a$Id
labels = sub("a.chr", ".chr", EH23a$Id)
)
p <- p + scale_y_continuous(
breaks = seq( 0, 120e6, by = 10e6),
labels = seq( 0, 120, by = 10)
)
# p <- p + ggplot2::theme_bw() +
p <- p + ggplot2::theme(
# panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.minor.x = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 ),
axis.text.x = element_text(angle = 60, hjust = 1),
axis.title.x=element_blank(),
panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
)
p <- p + ggplot2::ylab("Position (Mbp)")
p <- p + ggtitle( "EH23" )
#p
#p <- p + theme(legend.position='none')
p <- p + geom_polygon(
data = poly_coords,
aes(x = poly_xs, y = poly_ys, fill = ID, group = ID), #, fill = col),
alpha = 2/5,
show.legend = FALSE,
#color = NA,
# color = "#80808022",
# color = "#000000",
color = "#696969",
# color = "#808080",
# color = "#A9A9A9",
# color = "#C0C0C0",
linewidth = 0.4,
# fill = ID
# fill = "#C0C0C044"
fill = "#808080"
)
#p
p <- p + geom_polygon(
data = poly_coords_inv,
aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
alpha = 0.9,
show.legend = FALSE,
#color = NA,
# color = "#80808022",
#color = "#000000",
# linewidth = 0.1,
# fill = ID
# fill = "#C0C0C044"
fill = "#FFA500"
# fill = "#808080"
)
#p
# p + geom_polygon(
# data = poly_coords_dup,
# aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
# alpha = 0.9,
# show.legend = FALSE,
# fill = "#1E90FF"
# )
# p + geom_polygon(
# data = poly_coords_trans,
# aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
# alpha = 0.9,
# show.legend = FALSE,
# fill = "#1E90FF"
# )
thinw <- 0.28
p <- p + ggplot2::geom_rect(
data = EH23a_wins,
ggplot2::aes(
xmin = chrn - iCGs * thinw - 0.2,
#xmax = chrn + iCGs * thinw,
xmax = chrn - 0.2,
ymin = Start,
ymax = End),
fill = EH23a_wins$gcol,
color = NA
)
p <- p + ggplot2::geom_rect(
data = EH23b_wins,
ggplot2::aes(
#xmin = chrn - iCGs * thinw - 0.2,
xmin = chrn + 0.2,
xmax = chrn + iCGs * thinw + 0.2,
#xmax = chrn - 0.2,
ymin = Start,
ymax = End),
fill = EH23b_wins$gcol,
color = NA
)
p
#p + xlim(3.5, 4.5) #+ ylim(1e6, 2e6)
p + xlim(5.5, 8.5) + ylim(10e6, 40e6)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 10 rows containing missing values (`geom_rect()`).
## Removed 10 rows containing missing values (`geom_rect()`).
## Warning: Removed 654 rows containing missing values (`geom_rect()`).
## Warning: Removed 653 rows containing missing values (`geom_rect()`).
#pEH23 <- p
#save(pEH23, file = "ideo.RData")
#load("ideo.RData")
#pEH23
#p + theme(legend.text = element_text("Gene"), legend.position = "right")
#p + annotate("rect", xmin = 1, xmax = 4, ymin = seq(1, 80e6, by = ), ymax = 1e7,
# alpha = .4,fill = "blue")
rheight <- 500e3
nrect <- 99
my_rect <- data.frame(
#xmin = rep(10.85, times = 10),
xmin = rep(10.90, times = 10),
xmax = rep(11.4, times = 10),
#xmax = rep(11.6, times = 10),
ymin = 0:nrect * rheight + 1,
ymax = 1:(nrect+1) * rheight
)
#my_rect$fill <- viridisLite::magma(n=nrow(my_rect), begin = 0.1, end = 1.0)
my_rect$fill <- viridisLite::magma(n=nrow(my_rect), begin = 0.1, end = 0.9)
my_rect$ymin <- my_rect$ymin + 15e6
my_rect$ymax <- my_rect$ymax + 15e6
#my_rect[1:3, ]
# p + annotate("rect", xmin = my_rect$xmin, xmax = my_rect$xmax,
# ymin = my_rect$ymin, ymax = my_rect$ymax,
# alpha = 1.0,fill = my_rect$fill)
p2 <- p + annotate("rect",
xmin = min(my_rect$xmin) - 0.05, xmax = max(my_rect$xmax) + 0.05,
ymin = min(my_rect$ymin) - 1e6, ymax = max(my_rect$ymax) + 1e6,
alpha = 1.0,
#fill = "#ffffff",
#fill = "#F5F5F5",
fill = "#DCDCDC",
#fill = "#C0C0C0",
color = "#000000") +
coord_cartesian(xlim = c(0.75, 10.25), # This focuses the x-axis on the range of interest
clip = 'off') + # This keeps the labels from disappearing
theme(plot.margin = unit(c(1,3,1,1), "lines"))
#p2
p2 <- p2 + annotate("rect",
xmin = my_rect$xmin, xmax = my_rect$xmax,
ymin = my_rect$ymin, ymax = my_rect$ymax,
alpha = 1.0,fill = my_rect$fill)
# p2 <- p2 + annotate("text", x = 11.65, y = 38e6,
# label = "Genes per 1Mbp window",
# angle = 270, fontface = 1, size = 10/.pt)
p2 <- p2 + annotate("text", x = 11.65, y = 39e6,
label = "Genes per 1Mbp window",
angle = 270, fontface = 1, size = 10/.pt)
p2 <- p2 + annotate("text", x = 11.15, y = 69e6, label = "156",
angle = 0, fontface = 1, size = 10/.pt)
#p2 <- p2 + annotate("text", x = 11.65, y = 65e6, label = "156",
# angle = 0, fontface = 1, size = 10/.pt)
#p2 + annotate("text", x = 11.3, y = 2e6, label = "0", angle = 0, color="#ffffff", fontface = 2)
#p2 + annotate("text", x = 11.3, y = -2e6, label = "0", angle = 0, fontface = 2)
p2 <- p2 + annotate("text", x = 11.15, y = 11e6, label = "0",
angle = 0, fontface = 1, size = 10/.pt)
p2
p <- p2
# round(seq(1, 156, length.out=100))
# Cannabinoids
my_rows <- grep("AB2|LY|NC_044376.1:c427", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - blstw,
xmax = chrn + 0 - 0.14,
ymin = sstart,
ymax = send),
fill = "#228B22",
color = '#228B22'
)
my_rows <- grep("AB2|LY|NC_044376.1:c427", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.14,
xmax = chrn + blstw,
ymin = sstart,
ymax = send),
fill = "#228B22",
color = '#228B22'
)
# 5S
my_rows <- grep("5S_", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - blstw,
xmax = chrn - 0.1,
ymin = sstart,
ymax = send),
fill = "#000000",
color = '#000000'
)
my_rows <- grep("5S_", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.1,
xmax = chrn + blstw,
ymin = sstart,
ymax = send),
fill = "#000000",
color = '#000000'
)
# 45S
my_rows <- grep("45s_|26s_|5.8s_|18s_", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - blstw,
xmax = chrn - 0.1,
ymin = sstart,
ymax = send),
fill = "#B22222",
linewidth = 2,
# linewidth = 4,
color = '#B22222'
)
my_rows <- grep("45s_|26s_|5.8s_|18s_", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.1,
xmax = chrn + blstw,
ymin = sstart,
ymax = send),
fill = "#B22222",
linewidth = 2,
# linewidth = 4,
color = '#B22222'
)
# Centromeres, subtelomeric repeats
my_rows <- grep("AH3Ma_CS-237_satelite", ablstn$qseqid)
table(ablstn$qseqid[my_rows])
##
## AH3Ma_CS-237_satelite
## 16
table(ablstn$sseqid[my_rows])
##
## EH23a.chr5 EH23a.chr8 EH23a.chr9 EH23a.chrX
## 2 3 2 9
p2 <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - 0.5,
xmax = chrn - 0.2,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
my_rows <- grep("AH3Ma_CS-237_satelite", bblstn$qseqid)
table(bblstn$qseqid[my_rows])
##
## AH3Ma_CS-237_satelite
## 2
table(bblstn$sseqid[my_rows])
##
## EH23b.chr9
## 2
p2 <- p2 + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.2,
xmax = chrn + 0.5,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
p2
my_rows <- grep("CsatSD_centromere_237bp", ablstn$qseqid)
table(ablstn$qseqid[my_rows])
##
## CsatSD_centromere_237bp
## 2366
table(ablstn$sseqid[my_rows])
##
## EH23a.chr5 EH23a.chr6 EH23a.chr8
## 5 890 1471
p2 <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - 0.5,
xmax = chrn - 0.2,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
my_rows <- grep("CsatSD_centromere_237bp", bblstn$qseqid)
table(bblstn$qseqid[my_rows])
##
## CsatSD_centromere_237bp
## 3314
table(bblstn$sseqid[my_rows])
##
## EH23b.chr5 EH23b.chr6 EH23b.chr8
## 6 1575 1733
p2 <- p2 + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.2,
xmax = chrn + 0.5,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
p2
my_rows <- grep("CsatSD_centromere_370bp", ablstn$qseqid)
table(ablstn$qseqid[my_rows])
##
## CsatSD_centromere_370bp
## 8275
table(ablstn$sseqid[my_rows])
##
## EH23a.chr1 EH23a.chr2 EH23a.chr3 EH23a.chr4 EH23a.chr5 EH23a.chr6 EH23a.chr7
## 2080 480 1524 1608 357 1208 177
## EH23a.chr8 EH23a.chr9 EH23a.chrX
## 86 478 277
p2 <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - 0.5,
xmax = chrn - 0.2,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
my_rows <- grep("CsatSD_centromere_370bp", bblstn$qseqid)
table(bblstn$qseqid[my_rows])
##
## CsatSD_centromere_370bp
## 11483
table(bblstn$sseqid[my_rows])
##
## EH23b.chr1 EH23b.chr2 EH23b.chr3 EH23b.chr4 EH23b.chr5 EH23b.chr6 EH23b.chr7
## 1395 1135 1462 2311 296 2626 397
## EH23b.chr8 EH23b.chr9 EH23b.chrX
## 483 875 503
p2 <- p2 + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.2,
xmax = chrn + 0.5,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
p2
my_rows <- grep("AH3Ma_CS-237_satelite|CsatSD_centromere_237bp|CsatSD_centromere_370bp", ablstn$qseqid)
table(ablstn$qseqid[my_rows])
##
## AH3Ma_CS-237_satelite CsatSD_centromere_237bp CsatSD_centromere_370bp
## 16 2366 8275
table(ablstn$sseqid[my_rows])
##
## EH23a.chr1 EH23a.chr2 EH23a.chr3 EH23a.chr4 EH23a.chr5 EH23a.chr6 EH23a.chr7
## 2080 480 1524 1608 364 2098 177
## EH23a.chr8 EH23a.chr9 EH23a.chrX
## 1560 480 286
# blstw <- 0.48
p <- p + ggplot2::geom_rect(
data = ablstn[my_rows, ],
ggplot2::aes(
xmin = chrn - 0.5,
xmax = chrn - 0.2,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
my_rows <- grep("AH3Ma_CS-237_satelite|CsatSD_centromere_237bp|CsatSD_centromere_370bp", bblstn$qseqid)
table(bblstn$qseqid[my_rows])
##
## AH3Ma_CS-237_satelite CsatSD_centromere_237bp CsatSD_centromere_370bp
## 2 3314 11483
table(bblstn$sseqid[my_rows])
##
## EH23b.chr1 EH23b.chr2 EH23b.chr3 EH23b.chr4 EH23b.chr5 EH23b.chr6 EH23b.chr7
## 1395 1135 1462 2311 302 4201 397
## EH23b.chr8 EH23b.chr9 EH23b.chrX
## 2216 877 503
p <- p + ggplot2::geom_rect(
data = bblstn[my_rows, ],
ggplot2::aes(
xmin = chrn + 0.2,
xmax = chrn + 0.5,
ymin = sstart,
ymax = send),
fill = "#1E90FF",
color = '#1E90FF'
)
pEH23 <- p
pEH23
# #save(pEH23, file = "EH23ideo.RData")
#save(pEH23, chrom_wid, thinw, blstw , file = "EH23ideo.RData")
EH23a_wins[1:3, ]
## Id Win_number Start End Win_length A C G T
## 1 EH23a.chr1 0 0 999999 1000000 339510 159349 157789 343352
## 2 EH23a.chr1 1 1000000 1999999 1000000 346184 152100 152139 349577
## 3 EH23a.chr1 2 2000000 2999999 1000000 343278 155236 154692 346794
## CG CHG CHH chrn CGs iCGs gcnt gcntsc gcol
## 1 14003 19695 90544 1 0.07933312 0.9206669 156 1.0000000 #FECE91FF
## 2 13018 18441 88693 1 0.00000000 1.0000000 141 0.9038462 #FEA973FF
## 3 14100 19606 89229 1 0.08714562 0.9128544 123 0.7884615 #FA7D5EFF
my_df <- rbind(EH23a_wins, EH23b_wins)
my_df$hap <- sub("\\.chr.", "", my_df$Id)
my_df$chrn <- as.factor(my_df$chrn)
p <- ggplot( my_df, mapping = aes(x = chrn, y = gcnt, fill = chrn))
p <- p + geom_violin( adjust = 0.3 )
#p + geom_dotplot(binaxis='y', stackdir='center', dotsize=0.2)
p <- p + facet_grid(hap ~ .)
p <- p + theme_bw()
p <- p + theme(legend.position = "none")
p <- p + scale_fill_brewer(palette="Paired")
p <- p + ylab("Gene count")
p <- p + theme(axis.title.x=element_blank())
p
# my_df$chrn <- as.numeric(as.character(my_df$chrn))
#
# p <- ggplot( my_df, mapping = aes(x = chrn, y = gcnt, fill = chrn))
# p + geom_point()
#EH23a_wins$gcnt
#EH23a_wins$Id
t9 <- Sys.time()
t9 - t1
## Time difference of 1.079779 mins
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.14.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
t10 <- Sys.time()
#vcf <- read.vcfR("F2s.variants5.freebayes.filt.3.vcf.gz", nrows = )
#vcf <- read.vcfR("F2s_0NA.vcf.gz", nrows = )
#vcf <- read.vcfR("/media/knausb/Vining_lab/private_data/salk/VCFs/F2s_0NA.vcf.gz", nrows = )
vcf <- read.vcfR("/media/knausb/E737-9B48/releases/VCFs/F2s.variants7.freebayes.filt.3.vcf.gz")
vcf
## ***** Object of Class vcfR *****
## 271 samples
## 10 CHROMs
## 93,251 variants
## Object size: 611.5 Mb
## 0 percent missing data
## ***** ***** *****
#vcf <- vcf[is.biallelic(vcf), ]
#vcf
vcf@gt <- vcf@gt[, grep("Parent_23", colnames(vcf@gt), invert = TRUE)]
#colnames(vcf@gt)
vcf
## ***** Object of Class vcfR *****
## 270 samples
## 10 CHROMs
## 93,251 variants
## Object size: 601 Mb
## 0 percent missing data
## ***** ***** *****
vcf@fix[1:3, 1:8]
## CHROM POS ID REF ALT QUAL FILTER INFO
## [1,] "EH23b.chr1" "48433" NA "C" "T" "53274.2" "PASS" NA
## [2,] "EH23b.chr1" "54381" NA "T" "C" "9113.38" "PASS" NA
## [3,] "EH23b.chr1" "61575" NA "A" "G" "42650" "PASS" NA
# gt <- extract.gt(vcf)
# my_na <- apply(gt, MARGIN = 1, function(x){ sum( is.na(x) ) })
# #hist(my_na)
# range(my_na)
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr1" ] <- "chr1"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr5" ] <- "chr2"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr2" ] <- "chr3"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr3" ] <- "chr4"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr4" ] <- "chr5"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr7" ] <- "chr6"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr8" ] <- "chr7"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr9" ] <- "chr8"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chr6" ] <- "chr9"
# vcf@fix[, "CHROM"][ vcf@fix[, "CHROM"] == "EH23b.chrX" ] <- "chrX"
vcf@fix[, "CHROM"] <- sub("^EH23b.", "", vcf@fix[, "CHROM"])
unique(vcf@fix[, "CHROM"])
## [1] "chr1" "chr5" "chr2" "chr3" "chr4" "chr7" "chr8" "chr9" "chr6" "chrX"
t11 <- Sys.time()
t11 - t10
## Time difference of 54.94225 secs
t11 - t1
## Time difference of 2.019458 mins
#
nucs2 <- read.csv("~/gits/CannabisPangenome/FigureSideo/EH23b.softmasked_nuccomp.csv")
#
nucs <- read.csv("~/gits/CannabisPangenome/FigureSideo/EH23b.softmasked_nuccomp.csv")
#nucs <- read.csv("/media/knausb/Vining_lab/private_data/salk/ERBxHO40_23_v2_hap_ERB/nuccomp/ERBxHO40_23.combined.v2.hic.hap_ERB_nuccomp.csv")
cbind(nucs2$Id, nucs$Id)
## [,1] [,2]
## [1,] "EH23b.chr1" "EH23b.chr1"
## [2,] "EH23b.chr5" "EH23b.chr5"
## [3,] "EH23b.chr2" "EH23b.chr2"
## [4,] "EH23b.chr3" "EH23b.chr3"
## [5,] "EH23b.chr4" "EH23b.chr4"
## [6,] "EH23b.chr7" "EH23b.chr7"
## [7,] "EH23b.chr8" "EH23b.chr8"
## [8,] "EH23b.chr9" "EH23b.chr9"
## [9,] "EH23b.chr6" "EH23b.chr6"
## [10,] "EH23b.chrX" "EH23b.chrX"
# nucs$Id[ nucs$Id == "chr_1e" ] <- "chr1"
# nucs$Id[ nucs$Id == "chr_2e" ] <- "chr5"
# nucs$Id[ nucs$Id == "chr_3e" ] <- "chr2"
# nucs$Id[ nucs$Id == "chr_4e" ] <- "chr3"
# nucs$Id[ nucs$Id == "chr_5e" ] <- "chr4"
# nucs$Id[ nucs$Id == "chr_6e" ] <- "chr7"
# nucs$Id[ nucs$Id == "chr_7e" ] <- "chr8"
# nucs$Id[ nucs$Id == "chr_8e" ] <- "chr9"
# nucs$Id[ nucs$Id == "chr_9e" ] <- "chr6"
# nucs$Id[ nucs$Id == "chr_Xe" ] <- "chrX"
nucs$chrom_num <- sub(".+chr", "", nucs$Id)
nucs$chrom_num[ nucs$chrom_num == "X" ] <- 10
nucs$chrom_num <- as.numeric(nucs$chrom_num)
nucs <- nucs[sort.int(nucs$chrom_num, decreasing = FALSE, index.return = TRUE)$ix, ]
nucs$Id
## [1] "EH23b.chr1" "EH23b.chr2" "EH23b.chr3" "EH23b.chr4" "EH23b.chr5"
## [6] "EH23b.chr6" "EH23b.chr7" "EH23b.chr8" "EH23b.chr9" "EH23b.chrX"
vcf2 <- vcf[ vcf@fix[, "CHROM"] == "chr1", ]
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr2", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr3", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr4", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr5", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr6", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr7", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr8", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chr9", ])
vcf2 <- rbind2(vcf2, vcf[ vcf@fix[, "CHROM"] == "chrX", ])
vcf <- vcf2
rm(vcf2)
table(vcf@fix[, "CHROM"])
##
## chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX
## 7867 10764 12517 10240 7145 10768 8238 8902 7498 9312
chrom_name <- "chr4"
vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ] <-
EH23b$Length[ EH23b$Id == "EH23b.chr4" ] -
as.numeric(vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ])
chrom_name <- "chr5"
vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ] <-
EH23b$Length[ EH23b$Id == "EH23b.chr5" ] -
as.numeric(vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ])
chrom_name <- "chr8"
vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ] <-
EH23b$Length[ EH23b$Id == "EH23b.chr8" ] -
as.numeric(vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ])
chrom_name <- "chrX"
vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ] <-
EH23b$Length[ EH23b$Id == "EH23b.chrX" ] -
as.numeric(vcf@fix[ , "POS"][ vcf@fix[, "CHROM"] == chrom_name ])
nucs[1:3, ]
## Id Length a A c C g G
## 1 EH23b.chr1 63722626 14201549 7060523 7211614 3358522 7189944 3368219
## 3 EH23b.chr2 80157000 21489664 4919026 11451031 2314835 11452769 2317984
## 4 EH23b.chr3 83805000 22850264 4639830 12292599 2161091 12260323 2163874
## t T w W s S m M k K r R y Y n N chrom_num
## 1 14242915 7080840 0 0 0 0 0 0 0 0 0 0 0 0 0 8500 1
## 3 21311082 4896609 0 0 0 0 0 0 0 0 0 0 0 0 0 4000 2
## 4 22785943 4646076 0 0 0 0 0 0 0 0 0 0 0 0 0 5000 3
nucs$Id
## [1] "EH23b.chr1" "EH23b.chr2" "EH23b.chr3" "EH23b.chr4" "EH23b.chr5"
## [6] "EH23b.chr6" "EH23b.chr7" "EH23b.chr8" "EH23b.chr9" "EH23b.chrX"
nucs$ends <- 0
nucs$ends[1] <- nucs$Length[1]
for(i in 2:nrow(nucs)){
nucs$ends[i] <- nucs$ends[i-1] + nucs$Length[i]
}
nucs$mids <- nucs$ends - nucs$Length/2
nucs$Id <- paste("chr", nucs$chrom_num, sep = "")
nucs$Id[ nucs$Id == "chr10" ] <- "chrX"
nucs[1:3, ]
## Id Length a A c C g G t
## 1 chr1 63722626 14201549 7060523 7211614 3358522 7189944 3368219 14242915
## 3 chr2 80157000 21489664 4919026 11451031 2314835 11452769 2317984 21311082
## 4 chr3 83805000 22850264 4639830 12292599 2161091 12260323 2163874 22785943
## T w W s S m M k K r R y Y n N chrom_num ends mids
## 1 7080840 0 0 0 0 0 0 0 0 0 0 0 0 0 8500 1 63722626 31861313
## 3 4896609 0 0 0 0 0 0 0 0 0 0 0 0 0 4000 2 143879626 103801126
## 4 4646076 0 0 0 0 0 0 0 0 0 0 0 0 0 5000 3 227684626 185782126
my_pos <- rePOS(vcf, lens = nucs)
my_popsum <- gt.to.popsum(vcf)
#my_popsum <- as.matrix(my_popsum)
my_popsum[1:3, ]
## n Allele_counts He Ne
## 1 270 275,265 0.4998285 1.999314
## 2 270 387,153 0.4061111 1.683817
## 3 270 272,268 0.4999726 1.999890
my_popsum <- cbind(vcf@fix[ , 1:2 ], my_popsum)
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne
## 1 chr1 48433 270 275,265 0.4998285 1.999314
## 2 chr1 54381 270 387,153 0.4061111 1.683817
## 3 chr1 61575 270 272,268 0.4999726 1.999890
class(my_popsum)
## [1] "data.frame"
gt <- extract.gt(vcf)
t(apply(gt[1:4, ], MARGIN = 1, function(x){ table(x, useNA = 'always') }))
## 0/0 0/1 1/1 <NA>
## chr1_48433 65 145 60 0
## chr1_54381 118 151 1 0
## chr1_61575 70 132 68 0
## chr1_62404 60 205 5 0
nrow(gt) * 4
## [1] 373004
#, levels = c("0/0", "0/1", "1/1"))
#tmp <- apply(gt, MARGIN = 1, function(x){ table(x, useNA = 'always') })
tmp <- apply(gt, MARGIN = 1, function(x){
table(factor(x, levels = c("0/0", "0/1", "1/1", NA), exclude = NULL))
})
length(tmp)
## [1] 373004
range(lapply(tmp, length))
## [1] 1 1
tmp <- matrix(tmp, ncol = 4, byrow = TRUE)
colnames(tmp) <- c("count0.0", "count0.1", "count1.1", "countNA")
tmp[1:5, ]
## count0.0 count0.1 count1.1 countNA
## [1,] 65 145 60 0
## [2,] 118 151 1 0
## [3,] 70 132 68 0
## [4,] 60 205 5 0
## [5,] 60 145 65 0
my_popsum <- cbind(my_popsum, tmp)
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 2 chr1 54381 270 387,153 0.4061111 1.683817 118 151 1
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## countNA
## 1 0
## 2 0
## 3 0
tmp <- matrix(unlist(strsplit(my_popsum$Allele_counts, split = ",")), ncol = 2, byrow = TRUE)
colnames(tmp) <- c("ERBcount", "HO40count")
my_popsum <- cbind(my_popsum, tmp)
my_popsum$ERBcount <- as.numeric(my_popsum$ERBcount)
my_popsum$HO40count <- as.numeric(my_popsum$HO40count)
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 2 chr1 54381 270 387,153 0.4061111 1.683817 118 151 1
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## countNA ERBcount HO40count
## 1 0 275 265
## 2 0 387 153
## 3 0 272 268
class(my_popsum)
## [1] "data.frame"
my_popsum$ERBfreq <- my_popsum$ERBcount/(my_popsum[, "n"] * 2)
my_popsum$HO40freq <- my_popsum$HO40count/(my_popsum[, "n"] * 2)
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 2 chr1 54381 270 387,153 0.4061111 1.683817 118 151 1
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## countNA ERBcount HO40count ERBfreq HO40freq
## 1 0 275 265 0.5092593 0.4907407
## 2 0 387 153 0.7166667 0.2833333
## 3 0 272 268 0.5037037 0.4962963
my_popsum$He2 <- 2 * my_popsum$ERBfreq * my_popsum$HO40freq
my_popsum$POS2 <- my_pos
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 2 chr1 54381 270 387,153 0.4061111 1.683817 118 151 1
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## countNA ERBcount HO40count ERBfreq HO40freq He2 POS2
## 1 0 275 265 0.5092593 0.4907407 0.4998285 48433
## 2 0 387 153 0.7166667 0.2833333 0.4061111 54381
## 3 0 272 268 0.5037037 0.4962963 0.4999726 61575
# write.table(x = my_popsum, file = "allele_freqs.csv", sep = ",",
# row.names = FALSE, col.names = TRUE)
Hs <- 2 * my_popsum$ERBfreq * my_popsum$HO40freq
Ho <- my_popsum$count0.1 / my_popsum$n
my_popsum$Fis <- (Hs - Ho)/Hs
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 2 chr1 54381 270 387,153 0.4061111 1.683817 118 151 1
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## countNA ERBcount HO40count ERBfreq HO40freq He2 POS2 Fis
## 1 0 275 265 0.5092593 0.4907407 0.4998285 48433 -0.07444254
## 2 0 387 153 0.7166667 0.2833333 0.4061111 54381 -0.37710898
## 3 0 272 268 0.5037037 0.4962963 0.4999726 61575 0.02216857
#my_popsum <- my_popsum[seq(1, nrow(my_popsum), by = 2), ]
# hist(my_popsum$ERBfreq)
# hist(my_popsum$HO40freq)
#my_popsum <- my_popsum[my_popsum$ERBfreq >= 0.4, ]
#my_popsum <- my_popsum[my_popsum$ERBfreq <= 0.6, ]
#
#my_popsum <- my_popsum[my_popsum$HO40freq >= 0.4, ]
#my_popsum <- my_popsum[my_popsum$HO40freq <= 0.6, ]
my_min <- 0.3
my_max <- 0.7
my_popsum <- my_popsum[my_popsum$ERBfreq >= my_min, ]
my_popsum <- my_popsum[my_popsum$ERBfreq <= my_max, ]
#
my_popsum <- my_popsum[my_popsum$HO40freq >= my_min, ]
my_popsum <- my_popsum[my_popsum$HO40freq <= my_max, ]
my_min <- -0.5
my_max <- 0.5
my_popsum <- my_popsum[my_popsum$Fis >= my_min, ]
my_popsum <- my_popsum[my_popsum$Fis <= my_max, ]
par( mfrow = c(1, 2) )
#
hist(my_popsum$HO40freq, xlab = "", ylab = "Count", main = "HO40 allele frequency")
#
hist(my_popsum$ERBfreq, xlab = "", ylab = "Count", main = "ERB allele frequency")
par( mfrow = c(1, 1) )
barplot( table( my_popsum$CHROM ), las = 3 )
title( ylab = "Variant count" )
title( main = paste("Total Variant count:", format( nrow(my_popsum), big.mark = ",") ) )
my_popsum[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## 5 chr1 64079 270 265,275 0.4998285 1.999314 60 145 65
## countNA ERBcount HO40count ERBfreq HO40freq He2 POS2 Fis
## 1 0 275 265 0.5092593 0.4907407 0.4998285 48433 -0.07444254
## 3 0 272 268 0.5037037 0.4962963 0.4999726 61575 0.02216857
## 5 0 265 275 0.4907407 0.5092593 0.4998285 64079 -0.07444254
nrow(my_popsum)
## [1] 89166
vcf@fix[1:3, 1:8]
## CHROM POS ID REF ALT QUAL FILTER INFO
## [1,] "chr1" "48433" NA "C" "T" "53274.2" "PASS" NA
## [2,] "chr1" "54381" NA "T" "C" "9113.38" "PASS" NA
## [3,] "chr1" "61575" NA "A" "G" "42650" "PASS" NA
nrow(vcf)
## [1] 93251
tmp1 <- paste(getCHROM(vcf), getPOS(vcf), sep = "_")
tmp2 <- paste( my_popsum$CHROM, my_popsum$POS, sep = "_")
vcf <- vcf[tmp1 %in% tmp2, ]
vcf
## ***** Object of Class vcfR *****
## 270 samples
## 10 CHROMs
## 89,166 variants
## Object size: 556.4 Mb
## 0 percent missing data
## ***** ***** *****
# write.vcf(vcf, file = "tmsalk.vcf.gz")
afreq <- my_popsum
EH23a <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
EH23a$Frequency <- EH23a$ERBfreq
EH23a$facet1 <- "EH23a"
EH23b <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
EH23b$Frequency <- EH23a$HO40freq
EH23b$facet1 <- "EH23b"
Het <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
Het$Frequency <- EH23a$He2
Het$facet1 <- "Het"
Het[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## 5 chr1 64079 270 265,275 0.4998285 1.999314 60 145 65
## countNA ERBcount HO40count ERBfreq HO40freq He2 POS2 Frequency
## 1 0 275 265 0.5092593 0.4907407 0.4998285 48433 0.4998285
## 3 0 272 268 0.5037037 0.4962963 0.4999726 61575 0.4999726
## 5 0 265 275 0.4907407 0.5092593 0.4998285 64079 0.4998285
## facet1
## 1 Het
## 3 Het
## 5 Het
Fis_df <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
#Fis_df$Frequency <- Fis
Fis_df$Frequency <- afreq$Fis
Fis_df$facet1 <- "Fis"
#
afreq <- rbind(EH23a, EH23b, Het, Fis_df)
#afreq <- rbind(EH23a, EH23b, Het)
afreq$facet2 <- afreq$facet1
afreq$facet2 <- sub("EH23[ab]", "EH23", afreq$facet2)
afreq$facet2[ afreq$facet2 == "EH23" ] <- "Allele"
afreq$facet2[ afreq$facet2 == "Het" ] <- "Heterozygosity"
#table(afreq$facet1)
table(afreq$facet2)
##
## Allele Fis Heterozygosity
## 178332 89166 89166
afreq[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1 count1.1
## 1 chr1 48433 270 275,265 0.4998285 1.999314 65 145 60
## 3 chr1 61575 270 272,268 0.4999726 1.999890 70 132 68
## 5 chr1 64079 270 265,275 0.4998285 1.999314 60 145 65
## countNA ERBcount HO40count ERBfreq HO40freq He2 POS2 Frequency
## 1 0 275 265 0.5092593 0.4907407 0.4998285 48433 0.5092593
## 3 0 272 268 0.5037037 0.4962963 0.4999726 61575 0.5037037
## 5 0 265 275 0.4907407 0.5092593 0.4998285 64079 0.4907407
## facet1 facet2
## 1 EH23a Allele
## 3 EH23a Allele
## 5 EH23a Allele
t20 <- Sys.time()
t20 - t1
## Time difference of 2.962397 mins
library(ggplot2)
library(ggsci)
table(afreq$facet2)
##
## Allele Fis Heterozygosity
## 178332 89166 89166
#
afreq <- afreq[afreq$facet2 != "Heterozygosity", ]
#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.01)
#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
#my_pal <- viridisLite::viridis(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
my_pal <- RColorBrewer::brewer.pal(n=10, name = "Paired")
#my_pal <- RColorBrewer::brewer.pal(n=10, name = "Set3")
my_pal <- paste(my_pal, "04", sep = "")
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)
#as.numeric(as.factor(afreq$CHROM))
#afreq$col2 <- viridisLite::magma( n = 10, alpha = 0.8, begin = 0, end = 1 )[as.numeric(as.factor(afreq$CHROM))]
# range(afreq$POS2)
#afreq$facet2 <- factor(afreq$facet2, levels = c("Allele", "F[IS]"))
p <- ggplot( data = afreq,
mapping = aes( x = POS2,
#y = ERBfreq,
#y = freq,
y = Frequency,
color = CHROM )
#color = col2)
)
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + theme( axis.title.x=element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
# plot.margin = unit(c(0.1,0.1,1,0.1), "cm")
)
#p <- p + theme(axis.title.x = element_blank(),
# axis.text.x = element_text( angle = 60, hjust = 1))
# p + scale_x_discrete(breaks = EH23a$mids2, labels = EH23a$Id)
# p + scale_x_discrete(breaks = EH23a$mids2)
p <- p + ggplot2::scale_x_continuous(
breaks = nucs$mids,
expand = expansion(mult = 0.01, add = 0.0),
# expand = expansion(mult = 0.02, add = 0.0),
labels = sub("a.chr", ".chr", nucs$Id)
)
p <- p + theme( panel.grid.major.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )
p <- p + theme( panel.grid.minor.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )
# p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 1, color = "#808080", linewidth = 0.4 )
# p <- p + geom_vline( xintercept = 0, linetype = 1, color = "#808080", linewidth = 0.4 )
#p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 5, color = "#808080", linewidth = 0.4 )
p <- p + geom_vline( xintercept = nucs$ends, linetype = 5, color = "#808080", linewidth = 0.4 )
p <- p + geom_vline( xintercept = 0, linetype = 5, color = "#808080", linewidth = 0.4 )
#p <- p + geom_hline( yintercept = 0.5, linetype = 1, color = "#000000", linewidth = 1 )
#p <- p + geom_hline( yintercept = c(0, 0.5), linetype = 1, color = "#000000", linewidth = 1 )
#p + scale_color_manual( values = viridisLite::magma( n = 10, alpha = 0.01, begin = 0.2, end = 0.9 ) )
#p <- p + xlim(100, sum(nuccomp$Length))
#p <- p + ylim(0.3, 0.7)
#p <- p + scale_color_manual( values = my_pal )
p <- p + scale_color_npg( alpha = 0.03 )
#p <- p + scale_color_npg( alpha = 0.05 )
#
#p <-
#p + facet_grid(facet1 ~ ., scales = "free_y", labeller = label_parsed)
#p <-
#ahplot + ylab("")
# ahplot <- ahplot + scale_y_continuous( breaks = seq(-1.0, 1.0, by = 0.2),
# minor_breaks = seq(-1, 1, by = 0.1) )
ahplot <- p + facet_grid(facet2 ~ . , scales = "free_y", labeller = label_parsed)
#ahplot <- ahplot + theme( panel.grid.major.y = element_line(color = "#696969", linewidth = 0.6 ) )
ahplot <- ahplot + theme( panel.grid.major.y = element_line(color = "#A9A9A9", linewidth = 0.4 ) )
#ahplot + theme( panel.grid.major.y = element_line(color = "#C0C0C0", linewidth = 0.2, linetype = 1 ) )
# myPlots[[i]] <- myPlots[[i]] + theme( panel.grid.minor.y=element_line(color = "#C0C0C0", size=0.2) )
ahplot <- ahplot + ylab(NULL)
#p
#ahplot
#ahplot + scale_color_npg( alpha = 0.04 )
#fill = "#FFA500"
#fill = "#1E90FF"
# ahplot + scale_color_manual( values=c("#1E90FF", "#FFA500") )
ahplot <- ahplot + scale_color_manual( values = rep(c("#1E90FF04", "#FFA50004"), times = 5) )
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
ahplot
# save(ahplot, file = "ahplot.RData")
library(ggplot2)
library(ggsci)
table(afreq$facet2)
##
## Allele Fis
## 178332 89166
#afreq <- afreq[afreq$facet2 != "Heterozygosity", ]
my_pal <- RColorBrewer::brewer.pal(n=10, name = "Paired")
my_pal <- paste(my_pal, "04", sep = "")
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)
#afreq[1:3, ]
afreq$POS3 <- as.numeric(afreq$POS)
p <- ggplot( data = afreq[afreq$facet2 == "Allele", ],
mapping = aes( x = POS3,
#x = POS2,
#y = ERBfreq,
#y = freq,
y = Frequency,
group = CHROM,
color = CHROM )
#color = col2)
)
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + theme( axis.title.x=element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)
)
# p <- p + ggplot2::scale_x_continuous(
# breaks = nucs$mids,
# expand = expansion(mult = 0.01, add = 0.0),
# labels = sub("a.chr", ".chr", nucs$Id)
# )
# p <- p + theme( panel.grid.major.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )
# p <- p + theme( panel.grid.minor.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )
# p <- p + geom_vline( xintercept = nucs$ends, linetype = 5, color = "#808080", linewidth = 0.4 )
# p <- p + geom_vline( xintercept = 0, linetype = 5, color = "#808080", linewidth = 0.4 )
p <- p + scale_color_npg( alpha = 0.03 )
#p + facet_grid(facet2 ~ . , scales = "free_y", labeller = label_parsed)
# p + facet_grid( . ~ CHROM , scales = "free_x", labeller = label_parsed)
#p + facet_grid( CHROM ~ . , scales = "free_x", labeller = label_parsed)
#p + facet_grid( CHROM ~ . , scales = "free")
p + facet_grid( CHROM ~ . )
# ggsave( filename = "EH23_freqs.tiff",
# device = "tiff",
# width = 6.5,
# height = 6.5,
# #height = 9,
# units = "in",
# dpi = 300,
# compression = "lzw")
barplot(table(cut(afreq$POS3[afreq$CHROM == "chr5"], breaks = seq(0, 90e6, by = 1e6))))
ahplot <- p + facet_grid(facet2 ~ . , scales = "free_y", labeller = label_parsed)
#ahplot <- ahplot + theme( panel.grid.major.y = element_line(color = "#696969", linewidth = 0.6 ) )
ahplot <- ahplot + theme( panel.grid.major.y = element_line(color = "#A9A9A9", linewidth = 0.4 ) )
#ahplot + theme( panel.grid.major.y = element_line(color = "#C0C0C0", linewidth = 0.2, linetype = 1 ) )
# myPlots[[i]] <- myPlots[[i]] + theme( panel.grid.minor.y=element_line(color = "#C0C0C0", size=0.2) )
ahplot <- ahplot + ylab(NULL)
library("ggpubr")
ggarrange(
plotlist = list(pEH23, ahplot),
labels = c("A", "B"),
label.x = 0,
label.y = c(1, 1.1),
ncol = 1, nrow = 2,
widths = 1, heights = c(2, 1))
Figure 1. Genomic architecture of the Cannabis sativa genome.
# ggsave( filename = "EH23_chroms_freqs.tiff",
# device = "tiff",
# width = 6.5,
# height = 6.5,
# #height = 9,
# units = "in",
# dpi = 300,
# compression = "lzw")
# ggsave( filename = "EH23_chroms_freqs.png",
# device = "png",
# width = 6.5,
# height = 6.5,
# #height = 9,
# units = "in",
# dpi = 300)
#pEH23
# ggsave( filename = "EH23_chroms.png",
# device = "png",
# width = 6.5,
# height = 6.5,
# #height = 9,
# units = "in",
# dpi = 300)
#ahplot
# ggsave( filename = "EH23_freqs.png",
# device = "png",
# width = 6.5,
# height = 6.5,
# #height = 9,
# units = "in",
# dpi = 300)
t99 <- Sys.time()
t99 - t1
## Time difference of 3.548725 mins